import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
# Read the city/day air-quality CSV into the working DataFrame `a`, then dump
# it to stdout. NOTE(review): the path is an absolute, machine-specific
# Windows location.
a = pd.read_csv(filepath_or_buffer=r'C:\Users\Sridevi\Downloads\city_day1.csv')
print(a)
City Date PM2.5 PM10 NO NO2 NOx NH3 \
0 Ahmedabad 1/1/2015 NaN NaN 0.92 18.22 17.15 NaN
1 Ahmedabad 1/2/2015 NaN NaN 0.97 15.69 16.46 NaN
2 Ahmedabad 1/3/2015 NaN NaN 17.40 19.30 29.70 NaN
3 Ahmedabad 1/4/2015 NaN NaN 1.70 18.48 17.97 NaN
4 Ahmedabad 1/5/2015 NaN NaN 22.10 21.42 37.76 NaN
... ... ... ... ... ... ... ... ...
14870 Visakhapatnam 6/27/2020 15.02 50.94 7.68 25.06 19.54 12.47
14871 Visakhapatnam 6/28/2020 24.38 74.09 3.42 26.06 16.53 11.99
14872 Visakhapatnam 6/29/2020 22.91 65.73 3.45 29.53 18.33 10.71
14873 Visakhapatnam 6/30/2020 16.64 49.97 4.05 29.26 18.80 10.03
14874 Visakhapatnam 7/1/2020 15.00 66.00 0.40 26.85 14.05 5.20
CO SO2 O3 Benzene Toluene Xylene AQI AQI_Bucket
0 0.92 27.64 133.36 0.00 0.02 0.00 NaN NaN
1 0.97 24.55 34.06 3.68 5.50 3.77 NaN NaN
2 17.40 29.07 30.70 6.80 16.40 2.25 NaN NaN
3 1.70 18.59 36.08 4.43 10.14 1.00 NaN NaN
4 22.10 39.33 39.31 7.01 18.89 2.78 NaN NaN
... ... ... ... ... ... ... ... ...
14870 0.47 8.55 23.30 2.24 12.07 0.73 41.0 Good
14871 0.52 12.72 30.14 0.74 2.21 0.38 70.0 Satisfactory
14872 0.48 8.42 30.96 0.01 0.01 0.00 68.0 Satisfactory
14873 0.52 9.84 28.30 0.00 0.00 0.00 54.0 Satisfactory
14874 0.59 2.10 17.05 NaN NaN NaN 50.0 Good
[14875 rows x 16 columns]
# Count the missing entries in every column.
a.isna().sum()
City 0 Date 0 PM2.5 2526 PM10 5645 NO 2232 NO2 2223 NOx 1183 NH3 5009 CO 717 SO2 2330 O3 2482 Benzene 1575 Toluene 1991 Xylene 7410 AQI 2666 AQI_Bucket 2666 dtype: int64
# Frame dimensions as a (rows, columns) tuple.
a.shape
(14875, 16)
# Total number of cells (rows * columns).
a.size
238000
# Remove three pollutant columns from `a` in place, then echo the frame.
# These are the columns with the highest missing-value counts in the
# null summary printed earlier.
a.drop(columns=['Xylene', 'PM10', 'NH3'], inplace=True)
a
| City | Date | PM2.5 | NO | NO2 | NOx | CO | SO2 | O3 | Benzene | Toluene | AQI | AQI_Bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ahmedabad | 1/1/2015 | NaN | 0.92 | 18.22 | 17.15 | 0.92 | 27.64 | 133.36 | 0.00 | 0.02 | NaN | NaN |
| 1 | Ahmedabad | 1/2/2015 | NaN | 0.97 | 15.69 | 16.46 | 0.97 | 24.55 | 34.06 | 3.68 | 5.50 | NaN | NaN |
| 2 | Ahmedabad | 1/3/2015 | NaN | 17.40 | 19.30 | 29.70 | 17.40 | 29.07 | 30.70 | 6.80 | 16.40 | NaN | NaN |
| 3 | Ahmedabad | 1/4/2015 | NaN | 1.70 | 18.48 | 17.97 | 1.70 | 18.59 | 36.08 | 4.43 | 10.14 | NaN | NaN |
| 4 | Ahmedabad | 1/5/2015 | NaN | 22.10 | 21.42 | 37.76 | 22.10 | 39.33 | 39.31 | 7.01 | 18.89 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14870 | Visakhapatnam | 6/27/2020 | 15.02 | 7.68 | 25.06 | 19.54 | 0.47 | 8.55 | 23.30 | 2.24 | 12.07 | 41.0 | Good |
| 14871 | Visakhapatnam | 6/28/2020 | 24.38 | 3.42 | 26.06 | 16.53 | 0.52 | 12.72 | 30.14 | 0.74 | 2.21 | 70.0 | Satisfactory |
| 14872 | Visakhapatnam | 6/29/2020 | 22.91 | 3.45 | 29.53 | 18.33 | 0.48 | 8.42 | 30.96 | 0.01 | 0.01 | 68.0 | Satisfactory |
| 14873 | Visakhapatnam | 6/30/2020 | 16.64 | 4.05 | 29.26 | 18.80 | 0.52 | 9.84 | 28.30 | 0.00 | 0.00 | 54.0 | Satisfactory |
| 14874 | Visakhapatnam | 7/1/2020 | 15.00 | 0.40 | 26.85 | 14.05 | 0.59 | 2.10 | 17.05 | NaN | NaN | 50.0 | Good |
14875 rows × 13 columns
# Column labels remaining in the frame.
a.columns
Index(['City', 'Date', 'PM2.5', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3',
'Benzene', 'Toluene', 'AQI', 'AQI_Bucket'],
dtype='object')
# Print per-column dtype and non-null counts plus memory usage.
a.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14875 entries, 0 to 14874 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 City 14875 non-null object 1 Date 14875 non-null object 2 PM2.5 12349 non-null float64 3 NO 12643 non-null float64 4 NO2 12652 non-null float64 5 NOx 13692 non-null float64 6 CO 14158 non-null float64 7 SO2 12545 non-null float64 8 O3 12393 non-null float64 9 Benzene 13300 non-null float64 10 Toluene 12884 non-null float64 11 AQI 12209 non-null float64 12 AQI_Bucket 12209 non-null object dtypes: float64(10), object(3) memory usage: 1.5+ MB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
a.describe()
| PM2.5 | NO | NO2 | NOx | CO | SO2 | O3 | Benzene | Toluene | AQI | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 12349.000000 | 12643.000000 | 12652.000000 | 13692.000000 | 14158.000000 | 12545.000000 | 12393.000000 | 13300.000000 | 12884.000000 | 12209.000000 |
| mean | 59.148547 | 18.721084 | 34.743467 | 35.887924 | 3.207676 | 15.199771 | 36.293217 | 3.196163 | 10.071038 | 171.151118 |
| std | 52.944778 | 23.601786 | 26.419470 | 33.602578 | 9.471177 | 20.573747 | 21.384305 | 8.937824 | 16.859380 | 167.429820 |
| min | 1.720000 | 0.060000 | 0.010000 | 0.000000 | 0.000000 | 0.480000 | 0.020000 | 0.000000 | 0.000000 | 20.000000 |
| 25% | 27.740000 | 6.040000 | 16.690000 | 15.737500 | 0.480000 | 5.580000 | 21.390000 | 0.170000 | 0.500000 | 80.000000 |
| 50% | 44.730000 | 10.130000 | 28.295000 | 25.460000 | 0.870000 | 8.880000 | 33.120000 | 1.340000 | 4.080000 | 113.000000 |
| 75% | 69.990000 | 20.860000 | 44.570000 | 44.455000 | 1.400000 | 15.730000 | 46.880000 | 3.820000 | 12.170000 | 195.000000 |
| max | 685.360000 | 270.090000 | 292.020000 | 467.630000 | 175.810000 | 193.860000 | 257.730000 | 391.880000 | 411.520000 | 2049.000000 |
# Number of distinct values in each column.
a.nunique()
City 10 Date 2009 PM2.5 7393 NO 4357 NO2 6012 NOx 6307 CO 1622 SO2 3525 O3 5850 Benzene 1547 Toluene 3167 AQI 799 AQI_Bucket 6 dtype: int64
# Mean of every numeric column. numeric_only=True excludes the text columns
# (City, Date, AQI_Bucket) explicitly; relying on pandas to drop them silently
# is deprecated and raises TypeError on newer pandas releases.
a.mean(numeric_only=True)
C:\Users\Sridevi\AppData\Local\Temp\ipykernel_12428\1798845826.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. a.mean()
PM2.5 59.148547 NO 18.721084 NO2 34.743467 NOx 35.887924 CO 3.207676 SO2 15.199771 O3 36.293217 Benzene 3.196163 Toluene 10.071038 AQI 171.151118 dtype: float64
# Show a copy of `a` whose numeric NaNs are replaced by their column means.
# numeric_only=True skips the text columns (City, Date, AQI_Bucket) explicitly
# instead of relying on the deprecated silent drop of non-numeric columns,
# which raises TypeError on newer pandas releases.
# NOTE(review): the filled frame is only displayed; `a` itself still contains
# NaNs afterwards. Confirm whether the result was meant to be assigned back.
a.fillna(a.mean(numeric_only=True))
C:\Users\Sridevi\AppData\Local\Temp\ipykernel_12428\3867590745.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. a.fillna(a.mean())
| City | Date | PM2.5 | NO | NO2 | NOx | CO | SO2 | O3 | Benzene | Toluene | AQI | AQI_Bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ahmedabad | 1/1/2015 | 59.148547 | 0.92 | 18.22 | 17.15 | 0.92 | 27.64 | 133.36 | 0.000000 | 0.020000 | 171.151118 | NaN |
| 1 | Ahmedabad | 1/2/2015 | 59.148547 | 0.97 | 15.69 | 16.46 | 0.97 | 24.55 | 34.06 | 3.680000 | 5.500000 | 171.151118 | NaN |
| 2 | Ahmedabad | 1/3/2015 | 59.148547 | 17.40 | 19.30 | 29.70 | 17.40 | 29.07 | 30.70 | 6.800000 | 16.400000 | 171.151118 | NaN |
| 3 | Ahmedabad | 1/4/2015 | 59.148547 | 1.70 | 18.48 | 17.97 | 1.70 | 18.59 | 36.08 | 4.430000 | 10.140000 | 171.151118 | NaN |
| 4 | Ahmedabad | 1/5/2015 | 59.148547 | 22.10 | 21.42 | 37.76 | 22.10 | 39.33 | 39.31 | 7.010000 | 18.890000 | 171.151118 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14870 | Visakhapatnam | 6/27/2020 | 15.020000 | 7.68 | 25.06 | 19.54 | 0.47 | 8.55 | 23.30 | 2.240000 | 12.070000 | 41.000000 | Good |
| 14871 | Visakhapatnam | 6/28/2020 | 24.380000 | 3.42 | 26.06 | 16.53 | 0.52 | 12.72 | 30.14 | 0.740000 | 2.210000 | 70.000000 | Satisfactory |
| 14872 | Visakhapatnam | 6/29/2020 | 22.910000 | 3.45 | 29.53 | 18.33 | 0.48 | 8.42 | 30.96 | 0.010000 | 0.010000 | 68.000000 | Satisfactory |
| 14873 | Visakhapatnam | 6/30/2020 | 16.640000 | 4.05 | 29.26 | 18.80 | 0.52 | 9.84 | 28.30 | 0.000000 | 0.000000 | 54.000000 | Satisfactory |
| 14874 | Visakhapatnam | 7/1/2020 | 15.000000 | 0.40 | 26.85 | 14.05 | 0.59 | 2.10 | 17.05 | 3.196163 | 10.071038 | 50.000000 | Good |
14875 rows × 13 columns
# Show the AQI_Bucket column with missing labels substituted by 'Moderate'.
# NOTE(review): the returned Series is not stored, so `a` is left unchanged.
a.loc[:, 'AQI_Bucket'].replace(to_replace=np.nan, value='Moderate')
0 Moderate
1 Moderate
2 Moderate
3 Moderate
4 Moderate
...
14870 Good
14871 Satisfactory
14872 Satisfactory
14873 Satisfactory
14874 Good
Name: AQI_Bucket, Length: 14875, dtype: object
# Scatter of AQI against CO with a fitted regression line.
sns.regplot(data=a, x='CO', y='AQI')
<AxesSubplot:xlabel='CO', ylabel='AQI'>
# Regression scatter of AQI against CO, followed by AQI against Toluene.
# No new figure is opened between the two calls.
sns.regplot(data=a, x='CO', y='AQI')
sns.regplot(data=a, x='Toluene', y='AQI')
<AxesSubplot:xlabel='Toluene', ylabel='AQI'>
# One panel per AQI_Bucket: CO plotted against AQI as jittered points,
# with the regression fit switched off.
g = sns.FacetGrid(data=a, col="AQI_Bucket", margin_titles=True)
g.map(sns.regplot, "AQI", "CO", fit_reg=False, x_jitter=0.1, color=".3")
<seaborn.axisgrid.FacetGrid at 0x2b915119dc0>
# Visual map of missing cells: one heatmap row per record, one column per field.
sns.heatmap(data=a.isna(), cmap="Accent_r", cbar=False, yticklabels=False)
<AxesSubplot:>
# Annotated heatmap of pairwise correlations between the numeric columns.
# The text columns are filtered out up front: calling corr() on a frame that
# still holds them depends on pandas silently dropping them, which newer
# pandas releases no longer do.
plt.figure(figsize=(12, 10))
sns.heatmap(a.select_dtypes(include='number').corr(), annot=True, cmap='Blues')
<AxesSubplot:>
# Wide figure: AQI readings scattered per city.
plt.figure(figsize=(20, 10))
sns.scatterplot(data=a, x='City', y='AQI')
plt.show()

# Bar chart of AQI for each AQI_Bucket label.
sns.barplot(data=a, x='AQI_Bucket', y='AQI')
plt.show()

# List the distinct city names present in the data.
print(a['City'].unique())
['Ahmedabad' 'Bengaluru' 'Chennai' 'Coimbatore' 'Delhi' 'Hyderabad' 'Kochi' 'Kolkata' 'Mumbai' 'Visakhapatnam']
# Bar chart of AQI per city, one facet per AQI_Bucket, limited to the two
# cities named in `order`. FacetGrid opens its own figure, so no separate
# plt.figure() call is made here; one would only leave an empty figure behind.
g = sns.FacetGrid(a, col="AQI_Bucket", height=4, aspect=.5)
g.map(sns.barplot, "City", "AQI", color=".3", order=["Chennai", "Coimbatore"])
<seaborn.axisgrid.FacetGrid at 0x2b91870cf70>
<Figure size 1440x720 with 0 Axes>
# Distribution of AQI values.
sns.histplot(data=a, x="AQI", color="Purple")
plt.show()

# Pairwise plots of the numeric columns, coloured by the AQI value.
# NOTE(review): AQI is numeric with hundreds of distinct values; confirm that
# using it as the hue variable is intended.
sns.pairplot(data=a, hue="AQI")
<seaborn.axisgrid.PairGrid at 0x2b919b867f0>
# Pairwise plots of the numeric columns, coloured by AQI_Bucket label.
sns.pairplot(data=a, hue="AQI_Bucket")
<seaborn.axisgrid.PairGrid at 0x2b928de1a90>
# Grid of scatter plots for every pair of numeric columns, coloured by
# AQI_Bucket, with a legend attached.
g = sns.PairGrid(data=a, hue="AQI_Bucket")
g.map(sns.scatterplot).add_legend()
<seaborn.axisgrid.PairGrid at 0x2b9352cfd00>
# Horizontal box plots: spread of AQI within each AQI_Bucket label.
sns.boxplot(x="AQI", y="AQI_Bucket", data=a)
<AxesSubplot:xlabel='AQI', ylabel='AQI_Bucket'>
# Pairwise correlation table of the numeric columns. The text columns are
# filtered out explicitly, because corr() on a frame that still contains them
# fails on newer pandas releases.
a.select_dtypes(include='number').corr()
| PM2.5 | NO | NO2 | NOx | CO | SO2 | O3 | Benzene | Toluene | AQI | |
|---|---|---|---|---|---|---|---|---|---|---|
| PM2.5 | 1.000000 | 0.582403 | 0.481107 | 0.515665 | 0.136044 | 0.204288 | 0.363608 | 0.202893 | 0.294136 | 0.558344 |
| NO | 0.582403 | 1.000000 | 0.526127 | 0.761933 | 0.274019 | 0.254245 | 0.145343 | 0.184412 | 0.319343 | 0.494988 |
| NO2 | 0.481107 | 0.526127 | 1.000000 | 0.671461 | 0.420938 | 0.589141 | 0.347931 | 0.163130 | 0.469430 | 0.632438 |
| NOx | 0.515665 | 0.761933 | 0.671461 | 1.000000 | 0.264610 | 0.337986 | 0.259243 | 0.198954 | 0.379522 | 0.512750 |
| CO | 0.136044 | 0.274019 | 0.420938 | 0.264610 | 1.000000 | 0.594205 | 0.052150 | 0.164089 | 0.412458 | 0.794315 |
| SO2 | 0.204288 | 0.254245 | 0.589141 | 0.337986 | 0.594205 | 1.000000 | 0.184961 | 0.146508 | 0.467860 | 0.639325 |
| O3 | 0.363608 | 0.145343 | 0.347931 | 0.259243 | 0.052150 | 0.184961 | 1.000000 | 0.034971 | 0.133195 | 0.277670 |
| Benzene | 0.202893 | 0.184412 | 0.163130 | 0.198954 | 0.164089 | 0.146508 | 0.034971 | 1.000000 | 0.376649 | 0.236064 |
| Toluene | 0.294136 | 0.319343 | 0.469430 | 0.379522 | 0.412458 | 0.467860 | 0.133195 | 0.376649 | 1.000000 | 0.486838 |
| AQI | 0.558344 | 0.494988 | 0.632438 | 0.512750 | 0.794315 | 0.639325 | 0.277670 | 0.236064 | 0.486838 | 1.000000 |